Análisis de tweets de @UnidosxlaVidaCo

Librerías

Code
# Imports
import pandas as pd
import numpy as np
import spacy
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

Información

Los datos de este usuario cubren desde la creación de la cuenta (2011-06-15) hasta el 2023-03-01.

Code
# Print a summary of df: index range, per-column non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7830 entries, 171420 to 179249
Data columns (total 64 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   query                    7830 non-null   object        
 1   id                       7830 non-null   float64       
 2   timestamp_utc            7830 non-null   int64         
 3   local_time               7830 non-null   object        
 4   user_screen_name         7830 non-null   object        
 5   text                     7830 non-null   object        
 6   possibly_sensitive       4135 non-null   object        
 7   retweet_count            7830 non-null   float64       
 8   like_count               7830 non-null   float64       
 9   reply_count              7830 non-null   float64       
 10  impression_count         16 non-null     object        
 11  lang                     7830 non-null   object        
 12  to_username              1568 non-null   object        
 13  to_userid                1568 non-null   float64       
 14  to_tweetid               1281 non-null   float64       
 15  source_name              7830 non-null   object        
 16  source_url               7830 non-null   object        
 17  user_location            7830 non-null   object        
 18  lat                      4 non-null      object        
 19  lng                      4 non-null      object        
 20  user_id                  7830 non-null   object        
 21  user_name                7830 non-null   object        
 22  user_verified            7830 non-null   float64       
 23  user_description         7830 non-null   object        
 24  user_url                 7830 non-null   object        
 25  user_image               7830 non-null   object        
 26  user_tweets              7830 non-null   object        
 27  user_followers           7830 non-null   float64       
 28  user_friends             7830 non-null   object        
 29  user_likes               7830 non-null   float64       
 30  user_lists               7830 non-null   float64       
 31  user_created_at          7830 non-null   object        
 32  user_timestamp_utc       7830 non-null   float64       
 33  collected_via            7830 non-null   object        
 34  match_query              7830 non-null   float64       
 35  retweeted_id             0 non-null      float64       
 36  retweeted_user           0 non-null      float64       
 37  retweeted_user_id        0 non-null      float64       
 38  retweeted_timestamp_utc  0 non-null      object        
 39  quoted_id                293 non-null    object        
 40  quoted_user              293 non-null    object        
 41  quoted_user_id           293 non-null    float64       
 42  quoted_timestamp_utc     293 non-null    float64       
 43  collection_time          7830 non-null   object        
 44  url                      7830 non-null   object        
 45  place_country_code       265 non-null    object        
 46  place_name               265 non-null    object        
 47  place_type               265 non-null    object        
 48  place_coordinates        265 non-null    object        
 49  links                    2904 non-null   object        
 50  domains                  2904 non-null   object        
 51  media_urls               1533 non-null   object        
 52  media_files              1533 non-null   object        
 53  media_types              1533 non-null   object        
 54  media_alt_texts          47 non-null     object        
 55  mentioned_names          2767 non-null   object        
 56  mentioned_ids            2613 non-null   object        
 57  hashtags                 4969 non-null   object        
 58  intervention_type        0 non-null      float64       
 59  intervention_text        0 non-null      float64       
 60  intervention_url         0 non-null      float64       
 61  country                  7830 non-null   object        
 62  date                     7830 non-null   datetime64[ns]
 63  time                     7830 non-null   object        
dtypes: datetime64[ns](1), float64(20), int64(1), object(42)
memory usage: 3.9+ MB

Datos

Code
# Preview the first three rows of df.
df.head(3)
query id timestamp_utc local_time user_screen_name text possibly_sensitive retweet_count like_count reply_count ... media_alt_texts mentioned_names mentioned_ids hashtags intervention_type intervention_text intervention_url country date time
171420 from:UnidosxlaVidaCo 1.630982e+18 1677691376 2023-03-01T17:22:56 UnidosxlaVidaCo #QuienEsBeatriz y porque la @CorteIDH quiere c... NaN 8.0 7.0 0.0 ... NaN corteidh 190706828 quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:22:56
171421 from:UnidosxlaVidaCo 1.630982e+18 1677691337 2023-03-01T17:22:17 UnidosxlaVidaCo Una nueva intervención de la @CorteIDH para im... NaN 6.0 4.0 0.0 ... NaN corteidh 190706828 quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:22:17
171422 from:UnidosxlaVidaCo 1.630981e+18 1677691259 2023-03-01T17:20:59 UnidosxlaVidaCo #QuienEsBeatriz « LideresXlaVida: Beatriz Vs. ... NaN 6.0 6.0 0.0 ... NaN NaN NaN quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:20:59

3 rows × 64 columns

Dominios

Lista del top 20 de dominios mencionados en los tweets y su frecuencia:

Code
# Frequency of every distinct value found in the 'domains' column.
# NOTE(review): unlike 'hashtags' and 'mentioned_names' below, the cells are
# not split on '|' here -- confirm each cell holds a single domain.
domains = (
    df['domains']
    .value_counts()
)

# Keep the 20 most frequent entries; the bare name displays them.
top_domains = domains.nlargest(n=20)
top_domains
domains
fb.me                     1231
bit.ly                     242
unidosporlavida.com        193
facebook.com               171
instagram.com              125
sumall.com                  98
youtube.com                 68
lifenews.com                40
citizengo.org               36
youtu.be                    33
20ft.net                    33
aciprensa.com               19
votocatolico.co             18
actuall.com                 15
shar.es                     15
liveactionnews.org          15
twitter.com                 12
es.gaudiumpress.org         12
religionenlibertad.com      10
razonmasfe.com               8
Name: count, dtype: int64

Hashtags

Lista del top 20 de hashtags más usados y su frecuencia

Code
# Build one flat list with every hashtag of every tweet: cells that are NaN
# (tweets without hashtags) are skipped, the rest are split on the '|' delimiter.
hashtags = [
    tag
    for cell in df['hashtags'].to_list()
    if not pd.isna(cell)
    for tag in cell.split('|')
]

# Tally each hashtag and keep the 20 most frequent ones.
hashtags_count = pd.Series(hashtags).value_counts()
top_hashtags = hashtags_count.nlargest(n=20)
top_hashtags
sialavida                647
aborto                   416
9marchaxlavida           373
noalaborto               325
colombiaesprovida        295
eutanasia                157
procuradorordóñez        139
sialprocurador           138
yosoyprovida             135
soyprovida               108
negocio                  106
repost                   100
todavidaimporta          100
elijolas2vidas            98
colombia                  93
eutanasiano               91
abortocero                91
fiestaxlavida             91
4mayo7marchaporlavida     89
caravanaporlavida         88
Name: count, dtype: int64

Usuarios

Top 20 de usuarios más mencionados en los tweets

Code
# Build one flat list with every mentioned user name: NaN cells (tweets without
# mentions) are skipped, the remaining cells are split on the '|' delimiter.
users = [
    name
    for cell in df['mentioned_names'].to_list()
    if not pd.isna(cell)
    for name in cell.split('|')
]

# Number of mentions per user name
users_count = pd.Series(users).value_counts()

# The 20 most mentioned users, in descending order
top_users = users_count.nlargest(n=20)

top_users
marceposada        196
colombiaprovida    194
cconstitucional    176
monicaroa          173
sialprocurador     106
unidosxlavidaco    105
noticiasrcn         83
7marcofidelr        62
amadarosa           59
referendoxvida      51
colombiaderecha     49
profamiliacol       48
oea_oficial         47
comisionprimera     42
camaracolombia      40
lam_vero            36
wradiocolombia      35
unidosxlavida       35
yosoyprovida        34
aciprensa           32
Name: count, dtype: int64

Likes a lo largo del tiempo

Code
# Line chart of like_count against date, built from the rows of df;
# hovering a point also shows the tweet text.
fig = px.line(
    data_frame=df,
    x='date',
    y='like_count',
    hover_data=['text'],
    template='plotly_white',
    title='Número de likes en el tiempo',
)

# Render the figure
fig.show()

Tokens

Lista del top 20 de los tokens más comunes y su frecuencia

Code
# Spanish spaCy pipeline used to tokenize the tweets
nlp = spacy.load("es_core_news_sm")

# Stop-word set shipped with the pipeline's language defaults
STOP_WORDS = nlp.Defaults.stop_words


def filter_stopwords(text):
    """Return the lower-cased *text* reduced to its alphabetic, non-stop-word tokens.

    The text is lower-cased and run through the spaCy pipeline; tokens that are
    flagged as stop words, that appear in STOP_WORDS, or that are not purely
    alphabetic are discarded. The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in nlp(text.lower()):
        # Discard stop words (spaCy flag or explicit set membership)
        if token.is_stop or token.text in STOP_WORDS:
            continue
        # Discard numbers, punctuation, URLs and anything else non-alphabetic
        if not token.is_alpha:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# Store the filtered version of every tweet next to the raw text
df['preprocess'] = df['text'].apply(filter_stopwords)

# Split each filtered tweet on whitespace into one column per token position,
# stack those columns into a single Series of tokens, count each token and
# keep the first 20 entries of the frequency-sorted result.
token_counts = (
    df["preprocess"]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .iloc[:20]
)

token_counts
vida                 2070
aborto               1097
colombia              719
sialavida             661
colombiaesprovida     437
mayo                  390
q                     388
noalaborto            370
eutanasia             323
derecho               323
gracias               309
provida               308
muerte                268
feliz                 268
d                     263
voz                   250
mujer                 222
familia               210
mujeres               204
concepción            191
Name: count, dtype: int64

Hora

Lista de las 10 horas con más cantidad de tweets publicados

Code
# The hour is the piece of the 'time' string that precedes the first ':'
time_parts = df['time'].str.split(":")
df['hour'] = time_parts.str.get(0)

# Number of tweets per hour, keeping the 10 hours with the most tweets
hours_count = df['hour'].value_counts()
top_hours = hours_count.nlargest(n=10)
top_hours
hour
16    786
15    737
17    677
14    622
19    525
18    519
13    519
12    448
00    426
20    403
Name: count, dtype: int64

Plataformas

Plataformas desde las que se publicaron contenidos y su frecuencia

Code
# Count how many tweets were published from each value of 'source_name'.
df['source_name'].value_counts()
source_name
Twitter for iPhone             2031
Twitter Web App                1706
Twitter Web Client             1487
Facebook                       1468
Twitter for Android             412
Mobile Web                      163
TweetDeck                       133
erased88075                     131
Twitter for Websites            124
Instagram                        99
UberSocial for iPhone            22
Mobile Web (M2)                  12
iOS                              11
Twitter for Android Tablets      10
Twitter for Mac                   7
Tweeet! on iOS                    4
Hootsuite Inc.                    3
Buffer                            3
Hootsuite                         2
Twibbon                           1
Periscope                         1
Name: count, dtype: int64

Preprocesamiento

Code
# Step 1 - URLs: enable only the URL option and run p.clean over the raw text
p.set_options(p.OPT.URL)
df['text_pre'] = df['text'].apply(p.clean)

# Step 2 - mentions: enable only the MENTION option and run p.tokenize over
# the result of step 1
p.set_options(p.OPT.MENTION)
df['text_pre'] = df['text_pre'].apply(p.tokenize)

# Step 3 - emojis: replace each emoji with its textual description
df['text_pre'] = df['text_pre'].apply(demojize)

Tópicos

Code
# BERTopic documents its input as a list of strings, so hand it a plain list
# rather than the pandas Series (which also carries a non-zero-based index).
docs = df['text_pre'].to_list()

# Multilingual topic model; calculate_probabilities=True makes fit_transform
# return the topic probabilities alongside the topic assignments, and
# verbose=True logs the progress of each fitting stage.
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed tweets; `topics` holds one topic id per
# document and `probs` the corresponding probabilities.
topics, probs = topic_model.fit_transform(docs)
Code
# Visualize the topics found by the fitted model.
topic_model.visualize_topics()
Code
# Recompute the topic representations from docs, allowing both unigrams and
# bigrams (n_gram_range=(1, 2)).
topic_model.update_topics(docs, n_gram_range=(1, 2))
Code
# Reduce the model to nr_topics=11 topics; the call returns the model object,
# which is why the notebook echoes its repr.
topic_model.reduce_topics(docs, nr_topics=11)
<bertopic._bertopic.BERTopic at 0x7f2b701119d0>
Code
# Visualize the topics again, now after the representation update and the
# topic reduction.
topic_model.visualize_topics()
Code
# topics_over_time must receive the same documents, in the same order, that
# the model was fitted on, so take the preprocessed text ('text_pre') and the
# matching timestamps from df, the frame used throughout this notebook.
tweets = df['text_pre'].to_list()
timestamps = df['local_time'].to_list()

# Topic representations per time bin: the timeline is cut into 20 bins, and
# each bin's representation is tuned with the global representation
# (global_tuning) and with the previous bin (evolution_tuning).
topics_over_time = topic_model.topics_over_time(
    docs=tweets,
    timestamps=timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)

# Plot the evolution of (at most) the 20 most frequent topics.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)